{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "DataAnalysis"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from tools import *\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap 3 处理原始文本\n",
    "1.  如何访问文件内的文本？\n",
    "2.  如何将文档分割成单独的单词和标点符号，从而进行文本语料上的分析？\n",
    "3.  如何产生格式化的输出，并把结果保存在文件中？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 3.9 格式化：从链表到字符串(P126)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 3.9.1 从链表转换为字符串"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "We called him Tortoise because he taught us .\nWe;called;him;Tortoise;because;he;taught;us;.\nWecalledhimTortoisebecausehetaughtus.\n"
     ]
    }
   ],
   "source": [
    "silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']\n",
    "print(' '.join(silly))\n",
    "print(';'.join(silly))\n",
    "print(''.join(silly))"
   ]
  },
  {
   "source": [
    "### 3.9.2 字符串显示方式（两种）"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "cat\nb'cat'\n"
     ]
    }
   ],
   "source": [
    "word = 'cat'\n",
    "print(word)\n",
    "print(word.encode('utf-8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "hello \nworld\nb'hello \\nworld'\n"
     ]
    }
   ],
   "source": [
    "# print()函数按文本输出的格式输出，sentence或者 sentence.encode()则按字符串具体的内容输出\n",
    "sentence = \"\"\"hello \n",
    "world\"\"\"\n",
    "print(sentence)  # 以可读的形式输出对象的内容\n",
    "print(sentence.encode('utf-8')) # 变量提示"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "  dog   cat snake \n    4     3     1 \ncat -> 3:\tcat->3. \ncat->3; \tcat->3, \tdog -> 4:\tdog->4. \ndog->4; \tdog->4, \tsnake -> 1:\tsnake->1. \nsnake->1; \tsnake->1, \t"
     ]
    }
   ],
   "source": [
    "fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])\n",
    "fdist.tabulate()\n",
    "# 三种格式化输出文本的方法\n",
    "# %s(字符串) 和 %d(十进制整数) 为转换说明符\n",
    "for word in sorted(fdist):\n",
    "    print(word, '->', fdist[word], end=':\\t')\n",
    "    print('%s->%d' % (word, fdist[word]), end='. \\n')\n",
    "    print('{}->{}'.format(word, fdist[word]), end='; \\t')  # fromat()函数格式化输出文本\n",
    "    print('{1}->{0}'.format(fdist[word], word), end=', \\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Lee wants a sandwich right now.\nLee wants a spam fritter right now.\nLee wants a pancake right now.\n"
     ]
    }
   ],
   "source": [
    "template = 'Lee wants a {} right now.'\n",
    "menu = ['sandwich', 'spam fritter', 'pancake']\n",
    "for snack in menu:\n",
    "    print(template.format(snack))"
   ]
  },
  {
   "source": [
    "### 3.8.3 排列"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "左边靠齐，6个字符=> |dog   |cat   |man   |\n右边靠齐，6个字符=> |   dog|   cat|   man|\n"
     ]
    }
   ],
   "source": [
    "# 将文本按列排版\n",
    "print(\"左边靠齐，6个字符=> |{:6}|{:6}|{:6}|\".format('dog', 'cat', 'man'))\n",
    "print(\"右边靠齐，6个字符=> |{:>6}|{:>6}|{:>6}|\".format('dog', 'cat', 'man'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3.1416\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "\n",
    "# 浮点数，小数点后4位\n",
    "print('{:.4f}'.format(math.pi))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "accuracy for 9375 words: 34.1867%\n"
     ]
    }
   ],
   "source": [
    "count, total = 3205, 9375\n",
    "# 百分数，小数点后4位\n",
    "print('accuracy for {} words: {:.4%}'.format(total, count / total))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ex3-5 布朗语料库中情态动词在不同类别中的频率统计\n",
    "def tabulate(cfdist, words, categories):\n",
    "    print('{:16}'.format('Category'), end=' ')\n",
    "    for word in words:  # 不同情态动词的题头\n",
    "        print('{:>6}'.format(word), end=' ')\n",
    "    print()\n",
    "    for category in categories:  # 不同类别\n",
    "        print('{:16}'.format(category), end=' ')\n",
    "        for word in words:  # 不同情态动词\n",
    "            print('{:6}'.format(cfdist[category][word]), end=' ')\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "cfd['news']=  <FreqDist with 14394 samples and 100554 outcomes>\ncfd['news']['can']=  93\nCategory            can  could    may  might   must   will \nnews                 93     86     66     38     50    389 \nreligion             82     59     78     12     54     71 \nhobbies             268     58    131     22     83    264 \nscience_fiction      16     49      4     12      8     16 \nromance              74    193     11     51     45     43 \nhumor                16     30      8      8      9     13 \n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import brown\n",
    "\n",
    "cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()\n",
    "                               for word in brown.words(categories=genre))\n",
    "\n",
    "genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']\n",
    "modals = ['can', 'could', 'may', 'might', 'must', 'will']\n",
    "print(\"cfd['news']= \", cfd['news'])\n",
    "print(\"cfd['news']['can']= \", cfd['news']['can'])\n",
    "tabulate(cfd, modals, genres)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Monty Python   !\n01234567890123456789\n"
     ]
    }
   ],
   "source": [
    "# 通过使用变量指定字段的宽度\n",
    "print('{:{width}}'.format('Monty Python', width=15) + '!')\n",
    "print(''.join([str(i) for i in range(10)])*2)"
   ]
  },
  {
   "source": [
    "### 3.9.4 将结果写入文件(P130)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 输出文件的两种方式：print(str,file=output_file), output_file.write(str)\n",
    "# print()输出时默认在行结束时加入了换行符\n",
    "output_file = open('output.txt', 'w')\n",
    "words = set(nltk.corpus.genesis.words('english-kjv.txt'))\n",
    "for word in sorted(words):\n",
    "    print(word, file=output_file)\n",
    "print(str(len(words)), file=output_file)\n",
    "output_file.write('zYx.Tom')  # 返回写入的字符个数\n",
    "output_file.write(str(len(words)) + '\\n')  # 没有'\\n'则会连续写，不换行\n",
    "output_file.flush()  # 刷新写文件缓冲区\n",
    "output_file.close()"
   ]
  },
  {
   "source": [
    "### 3.9.5 文本换行(Text Wrapping)(P131)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "After (5) all (3) is (2) said (4) and (3) done (4) , (1) more (4) is (2) said (4) than (4) done (4) . (1) "
     ]
    }
   ],
   "source": [
    "# 文本过长，到行尾溢出\n",
    "saying = ['After', 'all', 'is', 'said', 'and', 'done', ',', 'more', 'is', 'said', 'than', 'done', '.']\n",
    "\n",
    "for word in saying:\n",
    "    print(word, '(' + str(len(word)) + ')', end=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >%s_(%d)< ---------------\nAfter_(5), all_(3), is_(2), said_(4), and_(3), done_(4), ,_(1),\nmore_(4), is_(2), said_(4), than_(4), done_(4), ._(1)\n"
     ]
    }
   ],
   "source": [
    "# 文本显示时自动换行\n",
    "from textwrap import fill\n",
    "\n",
    "format = '%s_(%d)'\n",
    "pieces = [format % (word, len(word)) for word in saying]\n",
    "output = ', '.join(pieces)\n",
    "wrapped = fill(output)  # 自动换行显示\n",
    "show_subtitle(format)\n",
    "print(wrapped)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >{}_({})< ---------------\nAfter_(5), all_(3), is_(2), said_(4), and_(3), done_(4), ,_(1),\nmore_(4), is_(2), said_(4), than_(4), done_(4), ._(1)\n"
     ]
    }
   ],
   "source": [
    "format = '{}_({})'\n",
    "pieces = [f'{word}_({len(word)})' for word in saying]\n",
    "output = ', '.join(pieces)\n",
    "wrapped = fill(output)  # 自动换行显示\n",
    "show_subtitle(format)\n",
    "print(wrapped)"
   ]
  },
  {
   "source": [
    "## 3.10 小结\n",
    "\n",
    "-   字符串中的字符是使用索引来访问的，索引从零开始计数(`str[0]`)\n",
    "-   子字符串使用切片符号访问(`str[3:5]`)\n",
    "-   字符串可以被分割成链表(`str.split()`);链表还可以连接成字符串`''.join(list)`。\n",
    "-   文本可以从文件中读取，也可以从URL地址中读取。\n",
    "-   分词是将文本分割成基本单位或者标记，例如：词和标点符号等。基于空格符的分词无法满足应用需要。\n",
    "-   词形归并是一个过程，将一个词的各种形式遇到这个词的标准形式或者引用形式，也称为词位或者词元。\n",
    "-   正则表达式是用来指定模式的方法，re.findall() 可以找到一个字符串中匹配一个模式的所有子字符串。\n",
    "-   在正则字符串前加上前缀`r`，提醒 Python 这个是正则表达式的字符串，不要处理包含的反斜杠。\n",
    "-   字符串格式化表达式包含格式字符串及转换标识符。"
   ],
   "cell_type": "markdown",
   "metadata": {}
  }
 ]
}